rm(list=ls())
library(readxl)
library(tidyverse)
library(stringi)

# Create folder to receive data
dir.create(here::here("data","processed","welfare_outcomes"))

# Function to create datase with observations per muncode-year and time-variable columns 
center <-function(df,year_center){
  df <- df %>% 
    mutate(center = year - year_center) %>% 
    select(-year) %>%
    filter(between(center,-3,4)) %>% 
    mutate(center = str_replace(ifelse(center<0,paste0("t_minus",center),paste0("t_plus",center)),"-",""),
           center = str_replace(center,"t_plus0","t_0")) %>% 
    gather(variable, value, -c(mun_code,center)) %>%  
    unite(center,variable,center) %>%
    spread(center,value) %>% 
    mutate(year = year_center) 
  return(df)
}


# IDEB ------ 

# unzip files ----

zip_files <- list.files(here::here("data","raw","INEP"), pattern = "ideb",full.names = T)

dir.create(here::here("data","raw","INEP","ideb"))

map(zip_files,unzip,
    exdir = here::here("data","raw","INEP","ideb"),
    junkpaths = T)

# Prepare Primary School (1st to 5th grades) ----- 

# Read

ideb_primary_school_raw <- read_excel(path = here::here("data/raw/INEP/ideb/divulgacao_anos_iniciais_municipios2017-atualizado-Jun_2019.xlsx"),
                                            skip = 9,
                                            col_names = T,
                                            na = "-")

# Rename and filter state and mun network
ideb_primary <- ideb_primary_school_raw %>% 
  rename_all(~str_to_lower(.)) %>% 
  rename_with(~str_replace(.,"pad14","test_scores_ps"),.cols = starts_with("pad")) %>% 
  rename_with(~str_replace(.,"p14","prog_rate_ps"),.cols = starts_with("p14")) %>%
  rename_with(~str_replace(.,"p4","prog_rate_ps_17"),.cols = starts_with("p4")) %>% #Note: include the _17 to indicate year
  rename_with(~str_replace(.,"ideb14","ideb_ps"),.cols = starts_with("ideb"))  %>% 
  filter(rede == "Municipal") %>% 
  mutate(mun_code = cod_mun) %>% 
  select(mun_code,starts_with("test_scores"),starts_with("prog"),starts_with("ideb"))
  
# pivot and create year
ideb_primary_wide <- ideb_primary %>% 
  pivot_longer(cols = test_scores_ps_05:ideb_ps_17,
               names_to = "variable_name", 
               values_to = "values") %>% 
  mutate(variable = str_sub(variable_name,1,-4),
         year = as.numeric(paste0("20",str_sub(variable_name,-2)))) %>% 
  select(-variable_name) %>% 
  pivot_wider(names_from = variable,"values_from" = values) %>% 
  mutate(mun_code = str_sub(mun_code,1,-2))


# Check number of missing obsvartions
ideb_primary_wide %>%  naniar::miss_var_summary()

ideb_primary_wide %>% 
  group_by(year) %>% 
  summarise(n_obs = n())

rm(ideb_primary,ideb_primary_school_raw) #Delete unecessary objects

# CENSO ESCOLAR ----------

zip_files <- list.files(here::here("data","raw","INEP"), pattern = "sinopse",full.names = T)

dir.create(here::here("data","raw","INEP","censo_escolar"))

map(zip_files,unzip,
    exdir = here::here("data","raw","INEP","censo_escolar"),
    junkpaths = T)

# rename due to weird encoding
files_to_rename <- list.files(here::here("data","raw","INEP","censo_escolar"), pattern = "Sinopse",full.names = T)

rename_xlsx <- function(file){
  file.rename(from = file,
              to = paste0(here::here("data","raw","INEP","censo_escolar"),"/censo_escolar_sinopse_",str_sub(file,-9)))
}

map(files_to_rename,rename_xlsx)

# read data primary school

read_censo_escolar <- function(file){
  
  census_year <- as.numeric(str_extract(file,"[0-9]{4}"))
  
  # Primary School 
  enrollment <- readxl::read_excel(file,
                                   sheet = "Ensino Fundamental 1.14",
                                   skip = 10,
                                   col_names = letters[1:15]) %>% 
    select(mun_code = d,
           mun_name = c,
           enroll_ps = i) %>% 
    filter(!is.na(mun_code))
  if(census_year >= 2011){
    teachers <- readxl::read_excel(file,
                                   sheet = "Ensino Fundamental 2.16",
                                   skip = 10,
                                   col_names = letters[1:23])  
  } else {
    teachers <- readxl::read_excel(file,
                                   sheet = "Ensino Fundamental 2.13",
                                   skip = 10,
                                   col_names = letters[1:23]) 
  }
  teachers <- teachers %>% 
    select(mun_code = d,
           teachers_ps = j) %>% 
    filter(!is.na(mun_code))
  
  schools <- readxl::read_excel(file,
                                sheet = "Ensino Fundamental 3.10",
                                skip = 10,
                                col_names = letters[1:15]) %>% 
    select(mun_code = d,
           schools_ps = i) %>% 
    filter(!is.na(mun_code))
  
  classes <- readxl::read_excel(file,
                                sheet = "Ensino Fundamental 4.7",
                                skip = 10,
                                col_names = letters[1:20]) %>% 
    select(mun_code = d,
           classroom_ps = i) %>% 
    filter(!is.na(mun_code))
  
  # Child Care and Pre-School
  childc_estab <- readxl::read_excel(file,
                                sheet = "Educação Infantil 3.4",
                                skip = 10,
                                col_names = letters[1:15]) %>% 
    select(mun_code = d,
           childc_estab = i) %>% 
    filter(!is.na(mun_code))
  
  presch_estab <- readxl::read_excel(file,
                                     sheet = "Educação Infantil 3.4",
                                     skip = 10,
                                     col_names = letters[1:15]) %>% 
    select(mun_code = d,
           presch_estab = n) %>% 
    filter(!is.na(mun_code))
  
  
  censo_data <- enrollment %>% 
    full_join(teachers, by = c("mun_code")) %>% 
    full_join(schools, by = c("mun_code")) %>% 
    full_join(classes, by = c("mun_code")) %>% 
    full_join(childc_estab, by = c("mun_code")) %>% 
    full_join(presch_estab, by = c("mun_code")) %>% 
    mutate(year = census_year,
           mun_code = as.character(mun_code))
  
  return(censo_data)
}

xlsx_files <- list.files(here::here("data","raw","INEP","censo_escolar"), pattern = ".xlsx",full.names = T)

censo_escolar_data <- map_df(xlsx_files,read_censo_escolar)


# Read Population
mun_pop <- read_rds(here::here("data","processed","citycharacteristics","pop_mun.rds")) %>% 
  select(-mun_name) %>% 
  filter(between(year,2005,2016))

# Join censo and population
censo_escolar_data <- censo_escolar_data %>% 
  mutate(mun_code = str_sub(mun_code,1,-2)) %>% 
  left_join(mun_pop, by = c("mun_code","year"))

censo_escolar_data <- censo_escolar_data %>% 
  mutate(ratio_StuTea_ps = enroll_ps/teachers_ps,
         avgclass_size_ps = enroll_ps/classroom_ps,
         avgschool_size_ps = enroll_ps/schools_ps,
         teachers_100K_ps = teachers_ps/mun_pop*100000,
         schools_100K_ps = schools_ps/mun_pop*100000,
         childc_estab_100K = childc_estab/mun_pop*100000,
         presch_estab_100K = presch_estab/mun_pop*100000)

# number of mun per year
censo_escolar_data %>% group_by(year) %>% 
  summarise(n_obs = n())

# Check number of missing obsvartions

censo_escolar_data %>%  naniar::miss_var_summary()

# Create educational outcomes ---
educational_outcomes <- censo_escolar_data %>% 
  full_join(ideb_primary_wide, by = c("mun_code","year")) %>% 
  select(-mun_pop,-mun_name)



# Center educational outcomes 
educational_outcomes <- map(seq(2004,2012, by = 4), center, df = educational_outcomes) %>%
  bind_rows() %>% 
  select(mun_code,year, everything()) %>%  
  mutate_at(vars(matches("_t_")),
            .funs = funs(as.numeric(.)))

#salva

write_rds(x = educational_outcomes,path = here::here("data","processed","welfare_outcomes","educational_outcomes.rds"))

# delete folders
unlink(here::here("data","raw","INEP","censo_escolar"),
       recursive = T)

unlink(here::here("data","raw","INEP","ideb"),
       recursive = T)

